In [410]:
try:
    #for data processing, data visualization preparation, data analysis 
    import pandas as pd
    import numpy as np
    
    #for data profiling
    from ydata_profiling import ProfileReport
    
    #for data visualization
    import matplotlib.pyplot as plt
    %matplotlib inline
    import seaborn as sns
    
    #for missing value visualization
    import missingno as msno
    
    #for data processing/manipulation
    import pyspark
    from pyspark.sql import SparkSession
    
    #for joining path to folder
    import os
    import datetime
except ImportError:
    !pip install pandas
    !pip install ydata_profiling
    !pip install numpy
    !pip install os
    !pip install matplotlib.pyplot
    !pip install seaborn
    !pip install missingno
    !pip install pyspark
    import pandas
    import ydata_profiling
    import numpy as np
    import os
    import matplotlib.pyplot as plt
    import seaborn as sns
    import missingno as msno
In [411]:
folder_path = r'/Users/torresliu/Desktop/Perpay/drive-download-20230516T193115Z-001'
user_file_path = os.path.join(folder_path, 'user_dataset.csv')
loan_file_path = os.path.join(folder_path, 'loan_dataset.csv')
user_df = pd.read_csv(user_file_path)
loan_df = pd.read_csv(loan_file_path)
In [412]:
user_df
Out[412]:
user_id signup_dt company_name spending_limit_est valid_phone_ind last_login was_referred_ind first_paystub_dt first_application_start_ts first_application_complete_ts first_awaiting_payment_ts first_repayment_ts
0 5f477f7c-3367-4bee-ba2c-f3b39a24c7cc 2020-10-13 00:06:21.603537 Other > Add a new company 1000.0 1 2022-11-22 01:21:08.832662 0 NaN NaN NaN NaN NaN
1 2752a3f2-0a96-42f9-bcc3-5fde7edec06b 2020-10-13 00:12:20.590843 Other > Add a new company 700.0 1 2020-12-31 01:50:17.564992 0 NaN 2020-12-31 01:50:18.097958 NaN NaN NaN
2 7d8072a0-0c99-4eb7-8c6f-2c7d54ba52c7 2020-10-13 00:15:26.586054 NaN NaN 0 2020-10-13 00:15:27.200942 0 NaN NaN NaN NaN NaN
3 0067c279-3b94-4867-ae51-aa182cfc8dc3 2020-10-13 00:17:16.041499 Other > Add a new company NaN 1 2020-10-27 15:27:35.952604 0 NaN NaN NaN NaN NaN
4 972e17fd-95b5-4bcc-8f17-c4323abd6c7b 2020-10-13 00:21:46.310764 NaN NaN 0 2020-10-13 00:21:46.398749 0 NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ...
95570 9d562896-402f-43fb-b2e8-95674b59c00c 2020-10-19 00:03:58.651384 Vishay 1000.0 1 2023-01-16 18:52:38.283396 0 NaN 2022-11-20 16:59:13.045054 2022-11-20 17:02:13.393381 2022-11-20 17:02:15.975683 2022-11-23 13:49:14.122587
95571 2db26318-1e86-4611-8343-f22c2f52f700 2020-10-27 17:36:53.948341 Anthem, Inc. 800.0 1 2023-01-16 19:02:09.110595 0 2020-11-02 20:37:56.130803 2020-11-01 23:23:57.524518 2020-11-01 23:25:14.11145 2020-11-02 20:39:23.167071 2021-09-03 14:12:56.800775
95572 19444a27-e573-4b06-9ebf-f49a80cb27ba 2020-11-05 13:43:05.890775 Travelers Insurance 1000.0 1 2023-01-15 11:35:48.447004 0 2021-07-13 23:44:08.466518 2022-02-04 16:25:05.34794 2022-03-22 22:05:36.403078 2022-03-22 22:05:39.51998 2022-04-08 13:48:49.29766
95573 b2cf7001-2145-4653-a58f-26f658d6fed0 2020-11-01 22:49:39.533347 United States Army (Civilians) 1000.0 1 2022-12-19 13:58:21.389337 0 2020-11-20 23:54:27.624186 2020-11-19 19:16:24.802811 2020-11-20 23:45:58.574113 2020-11-23 14:18:21.049638 NaN
95574 a9b1e320-847f-438f-b9c6-d3b374810d6c 2020-10-24 15:36:18.286066 Indian Health Service 1000.0 1 2023-01-15 20:36:10.793236 0 NaN 2023-01-15 20:15:18.820641 2023-01-15 20:17:26.570373 2023-01-15 20:17:33.152737 2023-01-15 20:28:19.813885

95575 rows × 12 columns

User Data Profiling¶

In [413]:
user_df.head()
Out[413]:
user_id signup_dt company_name spending_limit_est valid_phone_ind last_login was_referred_ind first_paystub_dt first_application_start_ts first_application_complete_ts first_awaiting_payment_ts first_repayment_ts
0 5f477f7c-3367-4bee-ba2c-f3b39a24c7cc 2020-10-13 00:06:21.603537 Other > Add a new company 1000.0 1 2022-11-22 01:21:08.832662 0 NaN NaN NaN NaN NaN
1 2752a3f2-0a96-42f9-bcc3-5fde7edec06b 2020-10-13 00:12:20.590843 Other > Add a new company 700.0 1 2020-12-31 01:50:17.564992 0 NaN 2020-12-31 01:50:18.097958 NaN NaN NaN
2 7d8072a0-0c99-4eb7-8c6f-2c7d54ba52c7 2020-10-13 00:15:26.586054 NaN NaN 0 2020-10-13 00:15:27.200942 0 NaN NaN NaN NaN NaN
3 0067c279-3b94-4867-ae51-aa182cfc8dc3 2020-10-13 00:17:16.041499 Other > Add a new company NaN 1 2020-10-27 15:27:35.952604 0 NaN NaN NaN NaN NaN
4 972e17fd-95b5-4bcc-8f17-c4323abd6c7b 2020-10-13 00:21:46.310764 NaN NaN 0 2020-10-13 00:21:46.398749 0 NaN NaN NaN NaN NaN
In [414]:
user_df
Out[414]:
user_id signup_dt company_name spending_limit_est valid_phone_ind last_login was_referred_ind first_paystub_dt first_application_start_ts first_application_complete_ts first_awaiting_payment_ts first_repayment_ts
0 5f477f7c-3367-4bee-ba2c-f3b39a24c7cc 2020-10-13 00:06:21.603537 Other > Add a new company 1000.0 1 2022-11-22 01:21:08.832662 0 NaN NaN NaN NaN NaN
1 2752a3f2-0a96-42f9-bcc3-5fde7edec06b 2020-10-13 00:12:20.590843 Other > Add a new company 700.0 1 2020-12-31 01:50:17.564992 0 NaN 2020-12-31 01:50:18.097958 NaN NaN NaN
2 7d8072a0-0c99-4eb7-8c6f-2c7d54ba52c7 2020-10-13 00:15:26.586054 NaN NaN 0 2020-10-13 00:15:27.200942 0 NaN NaN NaN NaN NaN
3 0067c279-3b94-4867-ae51-aa182cfc8dc3 2020-10-13 00:17:16.041499 Other > Add a new company NaN 1 2020-10-27 15:27:35.952604 0 NaN NaN NaN NaN NaN
4 972e17fd-95b5-4bcc-8f17-c4323abd6c7b 2020-10-13 00:21:46.310764 NaN NaN 0 2020-10-13 00:21:46.398749 0 NaN NaN NaN NaN NaN
... ... ... ... ... ... ... ... ... ... ... ... ...
95570 9d562896-402f-43fb-b2e8-95674b59c00c 2020-10-19 00:03:58.651384 Vishay 1000.0 1 2023-01-16 18:52:38.283396 0 NaN 2022-11-20 16:59:13.045054 2022-11-20 17:02:13.393381 2022-11-20 17:02:15.975683 2022-11-23 13:49:14.122587
95571 2db26318-1e86-4611-8343-f22c2f52f700 2020-10-27 17:36:53.948341 Anthem, Inc. 800.0 1 2023-01-16 19:02:09.110595 0 2020-11-02 20:37:56.130803 2020-11-01 23:23:57.524518 2020-11-01 23:25:14.11145 2020-11-02 20:39:23.167071 2021-09-03 14:12:56.800775
95572 19444a27-e573-4b06-9ebf-f49a80cb27ba 2020-11-05 13:43:05.890775 Travelers Insurance 1000.0 1 2023-01-15 11:35:48.447004 0 2021-07-13 23:44:08.466518 2022-02-04 16:25:05.34794 2022-03-22 22:05:36.403078 2022-03-22 22:05:39.51998 2022-04-08 13:48:49.29766
95573 b2cf7001-2145-4653-a58f-26f658d6fed0 2020-11-01 22:49:39.533347 United States Army (Civilians) 1000.0 1 2022-12-19 13:58:21.389337 0 2020-11-20 23:54:27.624186 2020-11-19 19:16:24.802811 2020-11-20 23:45:58.574113 2020-11-23 14:18:21.049638 NaN
95574 a9b1e320-847f-438f-b9c6-d3b374810d6c 2020-10-24 15:36:18.286066 Indian Health Service 1000.0 1 2023-01-15 20:36:10.793236 0 NaN 2023-01-15 20:15:18.820641 2023-01-15 20:17:26.570373 2023-01-15 20:17:33.152737 2023-01-15 20:28:19.813885

95575 rows × 12 columns

In [415]:
#length of the user dataset.
len(user_df)
Out[415]:
95575

User Data Profiling: Format types¶

In [416]:
# user data set column types
user_df.dtypes
Out[416]:
user_id                           object
signup_dt                         object
company_name                      object
spending_limit_est               float64
valid_phone_ind                    int64
last_login                        object
was_referred_ind                   int64
first_paystub_dt                  object
first_application_start_ts        object
first_application_complete_ts     object
first_awaiting_payment_ts         object
first_repayment_ts                object
dtype: object
In [417]:
# Convert the column to datetimein a loop
datetime_columns = [
    'signup_dt',
    'last_login',
    'first_paystub_dt',
    'first_application_start_ts',
    'first_application_complete_ts',
    'first_awaiting_payment_ts',
    'first_repayment_ts'
]

for column in datetime_columns:
    user_df[column] = pd.to_datetime(user_df[column])
In [418]:
user_df.dtypes
Out[418]:
user_id                                  object
signup_dt                        datetime64[ns]
company_name                             object
spending_limit_est                      float64
valid_phone_ind                           int64
last_login                       datetime64[ns]
was_referred_ind                          int64
first_paystub_dt                 datetime64[ns]
first_application_start_ts       datetime64[ns]
first_application_complete_ts    datetime64[ns]
first_awaiting_payment_ts        datetime64[ns]
first_repayment_ts               datetime64[ns]
dtype: object

From above, in the user dataset, I see that the timespamp columns type were object, and because I want to work with the timespamp column later in my analysis, I convert all timespamp columns to datetime.¶

In [419]:
# data dimension
user_df.shape
Out[419]:
(95575, 12)
In [420]:
# describe statistical information
user_df.describe()
#no user_id, no company name
Out[420]:
signup_dt spending_limit_est valid_phone_ind last_login was_referred_ind first_paystub_dt first_application_start_ts first_application_complete_ts first_awaiting_payment_ts first_repayment_ts
count 95575 69055.000000 95575.000000 95575 95575.000000 8839 25105 18948 14388 1524
mean 2020-10-26 07:30:46.781417984 837.165216 0.813947 2021-03-01 14:29:40.952891136 0.035177 2021-02-20 00:47:41.967247872 2021-01-01 01:58:32.123276800 2021-01-09 17:34:31.836468224 2021-01-24 12:24:51.778903552 2021-04-23 19:11:41.864473856
min 2020-10-13 00:00:13.593771 50.000000 0.000000 2020-10-13 00:01:32.317156 0.000000 2016-11-12 00:40:01.316014 2016-11-12 00:22:57.724906 2016-11-12 00:40:24.715791 2016-11-12 18:17:14.283158 2016-11-30 21:17:45.514341
25% 2020-10-20 06:54:35.580967424 650.000000 1.000000 2020-10-24 22:18:45.041860864 0.000000 2020-10-26 18:07:49.435676928 2020-10-24 03:36:48.439672064 2020-10-24 23:20:00.373304832 2020-10-26 00:51:18.425989888 2020-11-13 15:04:34.914781440
50% 2020-10-27 01:36:18.374267904 1000.000000 1.000000 2020-11-02 08:15:36.039156992 0.000000 2020-11-05 02:37:25.778889984 2020-11-01 01:38:02.628940032 2020-11-02 02:40:46.823124992 2020-11-03 20:59:17.371486464 2020-12-04 11:47:32.950610688
75% 2020-11-01 05:30:42.165230592 1000.000000 1.000000 2021-01-10 07:02:38.536221952 0.000000 2021-01-17 04:38:59.484386560 2020-11-19 00:52:01.452822016 2020-11-30 19:18:59.148196608 2020-12-13 18:41:46.900191232 2021-09-17 14:33:02.806388992
max 2020-11-06 23:59:49.199581 1000.000000 1.000000 2023-01-16 19:26:10.352780 1.000000 2023-01-12 07:15:04.604709 2023-01-16 13:38:13.593884 2023-01-16 13:39:30.223596 2023-01-16 13:39:32.495406 2023-01-15 20:28:19.813885
std NaN 238.771310 0.389151 NaN 0.184227 NaN NaN NaN NaN NaN
In [421]:
# data information 
user_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 95575 entries, 0 to 95574
Data columns (total 12 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   user_id                        95575 non-null  object        
 1   signup_dt                      95575 non-null  datetime64[ns]
 2   company_name                   86027 non-null  object        
 3   spending_limit_est             69055 non-null  float64       
 4   valid_phone_ind                95575 non-null  int64         
 5   last_login                     95575 non-null  datetime64[ns]
 6   was_referred_ind               95575 non-null  int64         
 7   first_paystub_dt               8839 non-null   datetime64[ns]
 8   first_application_start_ts     25105 non-null  datetime64[ns]
 9   first_application_complete_ts  18948 non-null  datetime64[ns]
 10  first_awaiting_payment_ts      14388 non-null  datetime64[ns]
 11  first_repayment_ts             1524 non-null   datetime64[ns]
dtypes: datetime64[ns](7), float64(1), int64(2), object(2)
memory usage: 8.8+ MB

User Data Profiling: Missing Values¶

Calculating missing value counts for each field¶

Calculating missing value percentages for each field¶

In [422]:
null_df = pd.DataFrame({"Null Values": user_df.isnull().sum(),
                         "Percentage Null Values": (user_df.isnull().sum()) / (user_df.shape[0]) * 100
                         })

null_df
Out[422]:
Null Values Percentage Null Values
user_id 0 0.000000
signup_dt 0 0.000000
company_name 9548 9.990060
spending_limit_est 26520 27.747842
valid_phone_ind 0 0.000000
last_login 0 0.000000
was_referred_ind 0 0.000000
first_paystub_dt 86736 90.751766
first_application_start_ts 70470 73.732671
first_application_complete_ts 76627 80.174732
first_awaiting_payment_ts 81187 84.945854
first_repayment_ts 94051 98.405441

Missing Values Visualzation¶

In [423]:
user_df.columns
Out[423]:
Index(['user_id', 'signup_dt', 'company_name', 'spending_limit_est',
       'valid_phone_ind', 'last_login', 'was_referred_ind', 'first_paystub_dt',
       'first_application_start_ts', 'first_application_complete_ts',
       'first_awaiting_payment_ts', 'first_repayment_ts'],
      dtype='object')
In [424]:
user_msno = user_df[['user_id', 'signup_dt', 'company_name', 'spending_limit_est',
       'valid_phone_ind', 'last_login', 'was_referred_ind', 
       'first_application_start_ts', 'first_application_complete_ts',
       'first_awaiting_payment_ts','first_paystub_dt', 'first_repayment_ts']]
In [425]:
msno.bar(user_msno, color="blue")
plt.ylabel('Number of Non-null observation')
plt.show()

Comment: Look at the missing value visualization above, I see that there is gradual loss of loan application users as we move through the loan application funnel.¶

loan application funnel:¶

signup_dt --> first_application_start_ts --> first_application_complete_ts --> first_paystub_dt --> first_awaiting_payment_ts --> first_repayment_ts¶

User Data Profiling: Duplicates¶

In [426]:
user_df.columns
Out[426]:
Index(['user_id', 'signup_dt', 'company_name', 'spending_limit_est',
       'valid_phone_ind', 'last_login', 'was_referred_ind', 'first_paystub_dt',
       'first_application_start_ts', 'first_application_complete_ts',
       'first_awaiting_payment_ts', 'first_repayment_ts'],
      dtype='object')
In [429]:
user_check_dup = user_df.groupby(['user_id', 'signup_dt', 'company_name', 'spending_limit_est',
       'valid_phone_ind', 'last_login', 'was_referred_ind', 'first_paystub_dt',
       'first_application_start_ts', 'first_application_complete_ts',
       'first_awaiting_payment_ts', 'first_repayment_ts'])['user_id'].count().reset_index(name= 'Distinct Count')

user_check_dup
Out[429]:
user_id signup_dt company_name spending_limit_est valid_phone_ind last_login was_referred_ind first_paystub_dt first_application_start_ts first_application_complete_ts first_awaiting_payment_ts first_repayment_ts Distinct Count
0 00319372-d927-4703-be31-fddddece5e2b 2020-11-05 06:11:15.104940 Prime Time Healthcare 1000.0 1 2022-12-29 03:57:49.027318 0 2022-01-12 07:19:36.984471 2021-10-29 19:08:25.690945 2021-10-29 19:11:39.803046 2021-10-29 19:11:41.996796 2021-10-31 01:12:14.848954 1
1 00ddddcc-92ba-4713-b05d-13bfef9a2a05 2020-11-05 18:42:21.699998 Armanino 1000.0 1 2021-12-23 16:40:17.025239 0 2020-11-10 20:44:24.712778 2020-11-10 20:39:20.460429 2020-11-10 20:40:08.711397 2020-11-10 20:40:10.964056 2020-11-27 12:33:47.994959 1
2 00fdb33d-f087-4d44-91cf-3befd30a3695 2020-10-19 15:38:37.476959 American Legion 1000.0 1 2022-06-16 21:31:42.888787 0 2022-04-01 14:11:21.857435 2022-01-01 18:01:19.270900 2022-03-28 18:20:48.502681 2022-03-28 18:20:54.735573 2022-04-01 13:57:10.741083 1
3 0111d661-f6b7-40c7-8428-782bc1f6119a 2020-10-28 02:15:59.849178 Wabash National 500.0 1 2022-09-15 07:52:11.456729 0 2020-11-17 10:39:10.340941 2020-10-28 02:25:12.614750 2020-10-28 02:26:26.883021 2020-10-28 02:26:28.746381 2020-11-17 14:31:26.991396 1
4 0146f581-0040-4788-8b59-86d4fd1c1420 2020-11-01 00:13:43.426700 Paychex 900.0 1 2023-01-06 17:02:46.054490 0 2022-04-12 14:38:50.242406 2022-04-12 14:31:05.766885 2022-04-12 14:34:10.977920 2022-04-14 17:46:04.790958 2022-05-06 13:36:09.062834 1
... ... ... ... ... ... ... ... ... ... ... ... ... ...
1005 fe462bd4-cdac-4ac6-94ae-32ad33400cbb 2020-10-20 07:49:30.621476 Other > Add a new company 1000.0 1 2023-01-15 19:16:08.722549 0 2022-12-14 16:46:12.381446 2022-12-14 16:40:20.428949 2022-12-14 16:41:29.299477 2022-12-14 17:08:33.492203 2022-12-30 17:54:35.823323 1
1006 fe49346f-fde7-4b8a-8e58-0eb6a4ff9299 2020-11-02 19:20:39.966030 New York MTA 1000.0 1 2021-01-27 05:22:54.315546 0 2020-11-12 23:37:25.031354 2020-11-02 23:30:16.385741 2020-11-02 23:31:08.784334 2020-11-02 23:31:10.276118 2020-12-10 14:41:31.242395 1
1007 ff61dff9-88e4-4a85-85b9-2a912f156dc9 2020-10-21 19:13:34.506929 Trio Home Care 1000.0 1 2021-01-21 11:08:41.278935 0 2020-10-21 19:55:14.276958 2020-10-21 19:46:36.655985 2020-10-21 19:50:58.185742 2020-10-21 19:51:00.430324 2020-10-29 13:42:10.992583 1
1008 ffa28d2d-9d27-4f9c-9485-1dd1ae86606e 2020-10-26 04:40:22.509667 McDonald's 950.0 1 2022-12-27 22:14:06.405406 0 2020-10-26 19:42:59.731914 2022-10-20 17:28:44.160388 2022-10-20 17:29:57.864907 2022-10-20 17:30:00.311319 2022-11-10 18:05:19.339468 1
1009 ffe974b2-6323-476a-82ca-97cbaf2c1b39 2020-11-02 03:19:22.098953 Hudwell Delivery 1000.0 1 2022-09-19 22:38:27.114806 0 2020-11-09 04:00:52.447193 2020-11-09 03:39:04.402847 2020-11-09 03:39:55.104229 2020-11-09 03:39:57.320469 2020-11-20 15:30:41.737221 1

1010 rows × 13 columns

In [431]:
user_check_dup[user_check_dup['Distinct Count']>1]
Out[431]:
user_id signup_dt company_name spending_limit_est valid_phone_ind last_login was_referred_ind first_paystub_dt first_application_start_ts first_application_complete_ts first_awaiting_payment_ts first_repayment_ts Distinct Count
In [432]:
user_df.nunique()
Out[432]:
user_id                          95575
signup_dt                        95575
company_name                      7874
spending_limit_est                 134
valid_phone_ind                      2
last_login                       95575
was_referred_ind                     2
first_paystub_dt                  8839
first_application_start_ts       25105
first_application_complete_ts    18948
first_awaiting_payment_ts        14388
first_repayment_ts                1524
dtype: int64
In [433]:
user_df.duplicated().sum()
Out[433]:
0

User Data Profiling: Outliers¶

In [434]:
sns.boxplot(x = 'spending_limit_est', data = user_df)
#Get the current axes
ax = plt.gca()

#Set the x-axis limit to start from 0
ax.set_xlim(left = 0)

plt.title('Spending Limit Estimate Boxplot')
Out[434]:
Text(0.5, 1.0, 'Spending Limit Estimate Boxplot')

Comment: from above,the maximum spending limit is equal to the upper quartile, so I conclude that the spending limit of the user are capped at 1000 dollars and there are no suspicious outliers as spending limit being greater than 0 is normal in this application funnel.¶

Loan Data Profiling¶

In [435]:
loan_df
Out[435]:
loan_id user_id amount number_of_payments user_pinwheel_eligible_at_ap approval_type application_start_ts application_complete_ts awaiting_payment_ts repayment_ts canceled_ts cancellation_type risk_tier_at_uw
0 27f95ba4-e56f-4222-bc92-d73bf8d7d669 beca750a-2041-4c26-8154-6aa85ae9b245 471.48 8.0 0 NaN 2020-10-13 00:04:43.64499 2020-10-13 00:08:29.744502 NaN NaN 2020-11-03 06:00:09.020005 automated.verification T0
1 040d9651-3ef2-46ed-9e28-6ad12bf00585 3401d86c-898b-4d6f-ba67-08b2bc005d5f 309.98 8.0 0 NaN 2020-10-13 00:24:50.475201 NaN NaN NaN 2020-10-27 09:00:09.906958 automated.application_started T0
2 71a279d5-5bce-4ea2-9e98-ec9d994256b1 ef4ed8ea-17a4-4f41-a1fd-84b6fe09751f 523.98 8.0 1 underwriting.auto.ftb_decision 2020-10-13 00:26:48.749278 2020-10-13 00:27:53.698868 2020-10-13 12:19:55.282708 NaN 2021-02-11 06:30:08.28502 automated.awaiting_payment T0
3 706b5235-8424-4e91-b813-328dc5603e8e 1fa64260-49bf-474e-8800-893c0c455a06 369.96 4.0 0 NaN 2020-10-13 01:08:06.763585 NaN NaN NaN 2020-10-27 09:00:08.296637 automated.application_started T0
4 d850645f-a7fe-4cd1-97db-8f1ac1364a67 90f815e9-c931-4d9e-aa79-f136bc5f227c 479.98 8.0 0 NaN 2020-10-13 01:10:40.530583 NaN NaN NaN 2020-10-27 09:00:08.090965 automated.application_started T0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
39007 c538afec-eabb-442c-b53f-0d8163ba679c ff61dff9-88e4-4a85-85b9-2a912f156dc9 887.89 8.0 0 underwriting.auto.ftb_decision 2020-10-24 04:55:58.074156 2020-10-24 04:56:44.713389 2020-10-24 04:56:45.707634 2020-10-29 13:42:10.992583 NaN NaN T0
39008 8b6b254d-4a62-4579-b640-6a17d61cc76e 880fb13c-70dc-4604-9199-6e158f7c4eda 847.63 16.0 1 underwriting.auto.ftb_decision 2020-10-27 16:55:35.479453 2020-10-27 16:55:59.061944 2020-10-27 16:55:59.529787 2020-11-06 17:03:15.729864 NaN NaN T0
39009 5e1b86e3-1f17-4413-9551-5207de5b29c5 fc3cb0a3-a474-4d0f-9be9-e02ba5620d76 304.98 8.0 0 underwriting.auto.ftb_decision 2020-10-30 06:14:34.776961 2020-10-30 06:15:24.078435 2020-10-30 06:15:25.64894 2020-10-30 14:36:51.175908 NaN NaN T0
39010 601ee268-b2da-41ea-9470-0c2574e7410c 54612ae8-64d4-426e-8c22-57d2791a0b86 478.96 16.0 1 underwriting.auto.ftb_decision 2020-11-04 01:13:43.302878 2020-11-04 01:14:35.764894 2020-11-04 01:14:37.253225 2020-11-13 15:05:15.799022 NaN NaN T0
39011 da8f8586-6ac5-40c5-bdda-8e570151f35f 2ad3615a-81cf-4170-a036-c5a4c1eaeb11 647.94 8.0 1 underwriting.auto.ftb_decision 2020-11-06 02:20:08.042178 2020-11-06 02:21:09.263684 2020-11-06 02:21:11.054734 2020-11-20 15:25:37.351017 NaN NaN T0

39012 rows × 13 columns

Loan Data Profiling: Format types¶

In [436]:
string = "This is displayed in your Big Black Console"
type(string)
Out[436]:
str
In [437]:
loan_df.dtypes
Out[437]:
loan_id                          object
user_id                          object
amount                          float64
number_of_payments              float64
user_pinwheel_eligible_at_ap      int64
approval_type                    object
application_start_ts             object
application_complete_ts          object
awaiting_payment_ts              object
repayment_ts                     object
canceled_ts                      object
cancellation_type                object
risk_tier_at_uw                  object
dtype: object
In [438]:
# Convert the columns to datetime in a loop
datetime_columns = [
                    'application_start_ts',
                    'application_complete_ts',
                    'awaiting_payment_ts',
                    'repayment_ts',
                    'repayment_ts',
                    'canceled_ts'
]

for column in datetime_columns:
    loan_df[column] = pd.to_datetime(loan_df[column])
In [439]:
# Convert the columns to string in a loop
datetime_columns = [
                    'approval_type',
                    'cancellation_type',
]

for column in datetime_columns:
    loan_df[column] = loan_df[column].astype(str)
In [440]:
loan_df.dtypes
Out[440]:
loan_id                                 object
user_id                                 object
amount                                 float64
number_of_payments                     float64
user_pinwheel_eligible_at_ap             int64
approval_type                           object
application_start_ts            datetime64[ns]
application_complete_ts         datetime64[ns]
awaiting_payment_ts             datetime64[ns]
repayment_ts                    datetime64[ns]
canceled_ts                     datetime64[ns]
cancellation_type                       object
risk_tier_at_uw                         object
dtype: object

From above, in the loan dataset, I see that the timespamp columns type were object, and because I want to work with the timespamp column later in my analysis, I convert all timespamp columns to datetime.¶

In [441]:
# data dimension
loan_df.shape #row less than user data set
Out[441]:
(39012, 13)
In [442]:
loan_df
Out[442]:
loan_id user_id amount number_of_payments user_pinwheel_eligible_at_ap approval_type application_start_ts application_complete_ts awaiting_payment_ts repayment_ts canceled_ts cancellation_type risk_tier_at_uw
0 27f95ba4-e56f-4222-bc92-d73bf8d7d669 beca750a-2041-4c26-8154-6aa85ae9b245 471.48 8.0 0 nan 2020-10-13 00:04:43.644990 2020-10-13 00:08:29.744502 NaT NaT 2020-11-03 06:00:09.020005 automated.verification T0
1 040d9651-3ef2-46ed-9e28-6ad12bf00585 3401d86c-898b-4d6f-ba67-08b2bc005d5f 309.98 8.0 0 nan 2020-10-13 00:24:50.475201 NaT NaT NaT 2020-10-27 09:00:09.906958 automated.application_started T0
2 71a279d5-5bce-4ea2-9e98-ec9d994256b1 ef4ed8ea-17a4-4f41-a1fd-84b6fe09751f 523.98 8.0 1 underwriting.auto.ftb_decision 2020-10-13 00:26:48.749278 2020-10-13 00:27:53.698868 2020-10-13 12:19:55.282708 NaT 2021-02-11 06:30:08.285020 automated.awaiting_payment T0
3 706b5235-8424-4e91-b813-328dc5603e8e 1fa64260-49bf-474e-8800-893c0c455a06 369.96 4.0 0 nan 2020-10-13 01:08:06.763585 NaT NaT NaT 2020-10-27 09:00:08.296637 automated.application_started T0
4 d850645f-a7fe-4cd1-97db-8f1ac1364a67 90f815e9-c931-4d9e-aa79-f136bc5f227c 479.98 8.0 0 nan 2020-10-13 01:10:40.530583 NaT NaT NaT 2020-10-27 09:00:08.090965 automated.application_started T0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
39007 c538afec-eabb-442c-b53f-0d8163ba679c ff61dff9-88e4-4a85-85b9-2a912f156dc9 887.89 8.0 0 underwriting.auto.ftb_decision 2020-10-24 04:55:58.074156 2020-10-24 04:56:44.713389 2020-10-24 04:56:45.707634 2020-10-29 13:42:10.992583 NaT nan T0
39008 8b6b254d-4a62-4579-b640-6a17d61cc76e 880fb13c-70dc-4604-9199-6e158f7c4eda 847.63 16.0 1 underwriting.auto.ftb_decision 2020-10-27 16:55:35.479453 2020-10-27 16:55:59.061944 2020-10-27 16:55:59.529787 2020-11-06 17:03:15.729864 NaT nan T0
39009 5e1b86e3-1f17-4413-9551-5207de5b29c5 fc3cb0a3-a474-4d0f-9be9-e02ba5620d76 304.98 8.0 0 underwriting.auto.ftb_decision 2020-10-30 06:14:34.776961 2020-10-30 06:15:24.078435 2020-10-30 06:15:25.648940 2020-10-30 14:36:51.175908 NaT nan T0
39010 601ee268-b2da-41ea-9470-0c2574e7410c 54612ae8-64d4-426e-8c22-57d2791a0b86 478.96 16.0 1 underwriting.auto.ftb_decision 2020-11-04 01:13:43.302878 2020-11-04 01:14:35.764894 2020-11-04 01:14:37.253225 2020-11-13 15:05:15.799022 NaT nan T0
39011 da8f8586-6ac5-40c5-bdda-8e570151f35f 2ad3615a-81cf-4170-a036-c5a4c1eaeb11 647.94 8.0 1 underwriting.auto.ftb_decision 2020-11-06 02:20:08.042178 2020-11-06 02:21:09.263684 2020-11-06 02:21:11.054734 2020-11-20 15:25:37.351017 NaT nan T0

39012 rows × 13 columns

In [443]:
# describe statistical information
loan_df.describe()
Out[443]:
amount number_of_payments user_pinwheel_eligible_at_ap application_start_ts application_complete_ts awaiting_payment_ts repayment_ts canceled_ts
count 39012.000000 37066.000000 39012.000000 39012 27164 18275 1499 36295
mean 519.174718 9.621000 0.504768 2020-10-26 13:19:25.628109568 2020-10-26 17:44:53.409922560 2020-10-26 20:17:28.368069632 2020-11-10 16:48:34.291845632 2020-12-11 05:59:21.579855360
min 7.980000 1.000000 0.000000 2020-10-13 00:01:42.191775 2020-10-13 00:07:09.577877 2020-10-13 00:26:00.888018 2020-10-14 13:52:27.445060 2020-10-13 00:57:56.290905
25% 284.930000 8.000000 0.000000 2020-10-21 09:47:13.928946688 2020-10-21 14:35:26.136439296 2020-10-21 16:52:21.867469568 2020-10-30 14:40:59.422031872 2020-11-01 09:00:28.966358272
50% 479.980000 8.000000 1.000000 2020-10-27 13:22:13.275449088 2020-10-27 15:01:48.780180992 2020-10-27 16:16:25.904480 2020-11-06 17:04:05.031853056 2020-11-15 06:00:09.628438016
75% 709.980000 16.000000 1.000000 2020-10-31 16:30:10.752801280 2020-10-31 21:19:59.311596032 2020-10-31 21:41:29.274088448 2020-11-13 15:05:15.291198976 2021-02-14 06:30:43.603794432
max 2682.980000 36.000000 1.000000 2020-11-06 23:59:45.190724 2020-11-19 19:11:21.439778 2020-12-04 11:25:52.610301 2021-04-02 14:22:56.357633 2021-04-08 05:46:53.563057
std 315.473946 4.462309 0.499984 NaN NaN NaN NaN NaN
In [444]:
# data information 
loan_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39012 entries, 0 to 39011
Data columns (total 13 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   loan_id                       39012 non-null  object        
 1   user_id                       39012 non-null  object        
 2   amount                        39012 non-null  float64       
 3   number_of_payments            37066 non-null  float64       
 4   user_pinwheel_eligible_at_ap  39012 non-null  int64         
 5   approval_type                 39012 non-null  object        
 6   application_start_ts          39012 non-null  datetime64[ns]
 7   application_complete_ts       27164 non-null  datetime64[ns]
 8   awaiting_payment_ts           18275 non-null  datetime64[ns]
 9   repayment_ts                  1499 non-null   datetime64[ns]
 10  canceled_ts                   36295 non-null  datetime64[ns]
 11  cancellation_type             39012 non-null  object        
 12  risk_tier_at_uw               39012 non-null  object        
dtypes: datetime64[ns](5), float64(2), int64(1), object(5)
memory usage: 3.9+ MB

Loan Data Profiling: Missing Values¶

Calculating missing value counts for each field¶

Calculating missing value percentages for each field¶

In [445]:
loan_null_df = pd.DataFrame({"Null Values": loan_df.isnull().sum(),
                         "Percentage Null Values": (loan_df.isnull().sum()) / (loan_df.shape[0]) * 100
                         })

loan_null_df
Out[445]:
Null Values Percentage Null Values
loan_id 0 0.000000
user_id 0 0.000000
amount 0 0.000000
number_of_payments 1946 4.988209
user_pinwheel_eligible_at_ap 0 0.000000
approval_type 0 0.000000
application_start_ts 0 0.000000
application_complete_ts 11848 30.370143
awaiting_payment_ts 20737 53.155439
repayment_ts 37513 96.157593
canceled_ts 2717 6.964524
cancellation_type 0 0.000000
risk_tier_at_uw 0 0.000000

Missing Values Visualzation¶

In [446]:
loan_df.columns
Out[446]:
Index(['loan_id', 'user_id', 'amount', 'number_of_payments',
       'user_pinwheel_eligible_at_ap', 'approval_type', 'application_start_ts',
       'application_complete_ts', 'awaiting_payment_ts', 'repayment_ts',
       'canceled_ts', 'cancellation_type', 'risk_tier_at_uw'],
      dtype='object')
In [447]:
loan_df['approval_type'].unique()
Out[447]:
array(['nan', 'underwriting.auto.ftb_decision',
       'underwriting.manual.approve', 'underwriting.manual_review',
       'underwriting.manual.verification', 'underwriting.auto.deny',
       'underwriting.manual.deny', 'underwriting.manual.override_approve',
       'underwriting.auto.approve'], dtype=object)
In [448]:
loan_msno = loan_df[['loan_id', 'user_id', 'amount', 'number_of_payments',
       'user_pinwheel_eligible_at_ap', 'application_start_ts',
       'application_complete_ts', 'approval_type', 'awaiting_payment_ts' ,'repayment_ts',
       'canceled_ts', 'cancellation_type', 'risk_tier_at_uw']]
msno.bar(loan_msno, color="blue")

plt.show()

Comment: from above, this validates my previous conclusion that there is a gradual loss of loan application users as we move through the loan application funnel.¶

Loan Data Profiling: Duplicates¶

In [449]:
loan_df.duplicated().sum()
Out[449]:
0
In [450]:
loan_df.columns
Out[450]:
Index(['loan_id', 'user_id', 'amount', 'number_of_payments',
       'user_pinwheel_eligible_at_ap', 'approval_type', 'application_start_ts',
       'application_complete_ts', 'awaiting_payment_ts', 'repayment_ts',
       'canceled_ts', 'cancellation_type', 'risk_tier_at_uw'],
      dtype='object')
In [451]:
loan_df
Out[451]:
loan_id user_id amount number_of_payments user_pinwheel_eligible_at_ap approval_type application_start_ts application_complete_ts awaiting_payment_ts repayment_ts canceled_ts cancellation_type risk_tier_at_uw
0 27f95ba4-e56f-4222-bc92-d73bf8d7d669 beca750a-2041-4c26-8154-6aa85ae9b245 471.48 8.0 0 nan 2020-10-13 00:04:43.644990 2020-10-13 00:08:29.744502 NaT NaT 2020-11-03 06:00:09.020005 automated.verification T0
1 040d9651-3ef2-46ed-9e28-6ad12bf00585 3401d86c-898b-4d6f-ba67-08b2bc005d5f 309.98 8.0 0 nan 2020-10-13 00:24:50.475201 NaT NaT NaT 2020-10-27 09:00:09.906958 automated.application_started T0
2 71a279d5-5bce-4ea2-9e98-ec9d994256b1 ef4ed8ea-17a4-4f41-a1fd-84b6fe09751f 523.98 8.0 1 underwriting.auto.ftb_decision 2020-10-13 00:26:48.749278 2020-10-13 00:27:53.698868 2020-10-13 12:19:55.282708 NaT 2021-02-11 06:30:08.285020 automated.awaiting_payment T0
3 706b5235-8424-4e91-b813-328dc5603e8e 1fa64260-49bf-474e-8800-893c0c455a06 369.96 4.0 0 nan 2020-10-13 01:08:06.763585 NaT NaT NaT 2020-10-27 09:00:08.296637 automated.application_started T0
4 d850645f-a7fe-4cd1-97db-8f1ac1364a67 90f815e9-c931-4d9e-aa79-f136bc5f227c 479.98 8.0 0 nan 2020-10-13 01:10:40.530583 NaT NaT NaT 2020-10-27 09:00:08.090965 automated.application_started T0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
39007 c538afec-eabb-442c-b53f-0d8163ba679c ff61dff9-88e4-4a85-85b9-2a912f156dc9 887.89 8.0 0 underwriting.auto.ftb_decision 2020-10-24 04:55:58.074156 2020-10-24 04:56:44.713389 2020-10-24 04:56:45.707634 2020-10-29 13:42:10.992583 NaT nan T0
39008 8b6b254d-4a62-4579-b640-6a17d61cc76e 880fb13c-70dc-4604-9199-6e158f7c4eda 847.63 16.0 1 underwriting.auto.ftb_decision 2020-10-27 16:55:35.479453 2020-10-27 16:55:59.061944 2020-10-27 16:55:59.529787 2020-11-06 17:03:15.729864 NaT nan T0
39009 5e1b86e3-1f17-4413-9551-5207de5b29c5 fc3cb0a3-a474-4d0f-9be9-e02ba5620d76 304.98 8.0 0 underwriting.auto.ftb_decision 2020-10-30 06:14:34.776961 2020-10-30 06:15:24.078435 2020-10-30 06:15:25.648940 2020-10-30 14:36:51.175908 NaT nan T0
39010 601ee268-b2da-41ea-9470-0c2574e7410c 54612ae8-64d4-426e-8c22-57d2791a0b86 478.96 16.0 1 underwriting.auto.ftb_decision 2020-11-04 01:13:43.302878 2020-11-04 01:14:35.764894 2020-11-04 01:14:37.253225 2020-11-13 15:05:15.799022 NaT nan T0
39011 da8f8586-6ac5-40c5-bdda-8e570151f35f 2ad3615a-81cf-4170-a036-c5a4c1eaeb11 647.94 8.0 1 underwriting.auto.ftb_decision 2020-11-06 02:20:08.042178 2020-11-06 02:21:09.263684 2020-11-06 02:21:11.054734 2020-11-20 15:25:37.351017 NaT nan T0

39012 rows × 13 columns

In [452]:
loan_df.groupby(['loan_id', 'user_id', 'amount', 'number_of_payments',
       'user_pinwheel_eligible_at_ap', 'approval_type', 'application_start_ts',
       'application_complete_ts', 'awaiting_payment_ts', 'repayment_ts',
        ])['user_id'].count().reset_index(name= 'Distinct Count')
Out[452]:
loan_id user_id amount number_of_payments user_pinwheel_eligible_at_ap approval_type application_start_ts application_complete_ts awaiting_payment_ts repayment_ts Distinct Count
0 00063e85-da3b-4e86-80be-fb08e65875c7 41659249-af7c-41cd-8dbf-3d0b2cb38a8c 81.98 8.0 1 underwriting.auto.ftb_decision 2020-10-23 00:52:17.494655 2020-10-23 00:53:47.084478 2020-10-23 00:53:48.970713 2020-10-30 14:41:03.110715 1
1 00100b15-07f9-4717-b456-bee53086dfc2 290e2109-dc8b-48ec-bb13-28fa6a97d152 559.98 8.0 1 underwriting.auto.ftb_decision 2020-10-26 16:42:57.134918 2020-10-26 16:44:11.731562 2020-10-26 16:44:13.787265 2020-11-25 14:52:42.767518 1
2 0032f35a-6822-4707-a912-56cb0a27bcf4 b6853856-c6b5-469d-b51b-0287628497fc 372.98 8.0 1 underwriting.auto.ftb_decision 2020-10-22 22:57:42.992983 2020-10-22 22:58:57.731909 2020-10-22 22:58:59.456508 2020-10-30 14:40:15.237530 1
3 00730e12-04ca-45dd-a8c7-8800cfb76ddc 13e58d18-208f-433b-b199-27dfdac1acb1 839.92 4.0 0 underwriting.auto.ftb_decision 2020-10-26 10:33:23.903355 2020-10-26 10:34:55.334010 2020-10-26 10:34:57.350641 2020-11-02 14:11:54.774641 1
4 007b93b8-b52f-46c8-9818-c4cc6e7d7aff dfb32ba5-6d43-45d9-b5f9-bdde4e42ac52 531.96 8.0 1 underwriting.auto.ftb_decision 2020-11-04 15:17:23.889207 2020-11-04 15:17:57.992434 2020-11-04 15:17:58.589918 2020-11-06 17:04:09.530275 1
... ... ... ... ... ... ... ... ... ... ... ...
1494 ffc96b15-4e32-425c-9a9d-8a44fd3c5a47 0ea245be-b7ce-4b61-9881-204b069021dc 212.98 16.0 1 underwriting.auto.ftb_decision 2020-11-02 00:00:14.242705 2020-11-02 00:02:21.840981 2020-11-02 00:02:24.306445 2020-11-06 16:59:54.164913 1
1495 ffcd70e3-6d73-4d0b-8d7a-60513e34b0b4 da2ad2c7-4764-44d3-8fa9-5383fceab818 569.98 8.0 1 underwriting.auto.ftb_decision 2020-11-06 19:47:36.811747 2020-11-06 19:47:45.782024 2020-11-06 20:29:35.276487 2020-11-20 15:26:49.611627 1
1496 ffdb7b76-a006-4e76-9c0b-abe87467a9f0 1507510f-4043-4913-a1d4-bd263fac3685 749.96 8.0 1 underwriting.auto.ftb_decision 2020-10-27 16:41:27.857770 2020-10-27 16:42:08.599788 2020-10-27 16:42:10.179225 2020-11-06 16:56:03.549415 1
1497 ffe68a35-bf91-43a8-bdd7-96a7bec7f258 b7781673-9c9e-4caa-83c6-39b7e8a237e0 686.92 8.0 1 underwriting.auto.ftb_decision 2020-10-14 15:56:15.302675 2020-10-14 17:02:40.962480 2020-10-14 17:02:42.831729 2020-11-06 17:03:00.221545 1
1498 ffe92b81-c940-4e1f-aacb-64faf27c15df 93916de2-8dbd-4a77-af9e-a916db9cfc37 86.98 8.0 1 underwriting.auto.ftb_decision 2020-10-13 01:20:02.370067 2020-10-13 01:20:26.633620 2020-10-13 01:20:31.748747 2020-10-23 13:59:02.709800 1

1499 rows × 11 columns

Loan Data Profiling: Outliers¶

In [453]:
sns.histplot(x = 'amount', data = loan_df)

## Calculate the lower and upper quantiles (e.g., 25th and 75th percentiles)
Q1 = np.percentile(loan_df['amount'],25)
Q3 = np.percentile(loan_df['amount'],75)

# Calculate the interquartile range (IQR)
IQR = Q3 - Q1

# Define threshold for identifying outliers (e.g., based on IQR)
lower_threshold = Q1 - 1.5*IQR
upper_threshold = Q3 +1.5*IQR

# Identify and mark outliers
outliers = [x for x in loan_df['amount'] if x < lower_threshold or x > upper_threshold]

#x represents each individual data point in the data list.
#The code checks if x is either less than the lower_threshold or greater than the upper_threshold. If either condition is true, it considers x as an outlier.
#If the condition is true for a particular x, it includes that x in the outliers list.

plt.scatter(outliers, [0] * len(outliers), color='red', marker='o', label='Outliers')

# Optional: Customize plot labels and title
plt.xlabel('Amount')
plt.ylabel('Frequency')
plt.title('Distribution of Amount with Outliers')

# Show the plot
plt.legend()
plt.show()
/Users/torresliu/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):

Comment: this graph shows that the total dollar value of the loan are mostly in a range between 300 dollars to 700 dollars. I keep these outliers and their corresponding row of observation as they indicate that these loans are with large amount of dollar value of the loan.¶

Using data profiling package as a validation for previous data profiling.¶

In [454]:
user_profile = ProfileReport(user_df, title="User Profiling Report")
user_profile
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[454]:

In [455]:
loan_profile = ProfileReport(loan_df, title="Loan Profiling Report")
loan_profile
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[455]:

❖ Propose a set of metrics that can be used to monitor and improve the loan application funnel¶

Metric 1. Application Initiation Rate¶

In [456]:
user_df[['user_id','signup_dt']].nunique()
Out[456]:
user_id      95575
signup_dt    95575
dtype: int64
In [457]:
total_signup = len(user_df.groupby(['user_id', 'signup_dt'])['signup_dt'].count().reset_index(name='mycol'))
total_signup
Out[457]:
95575
In [458]:
total_app_start = user_df[['first_application_start_ts']].notna().sum()
total_app_start
Out[458]:
first_application_start_ts    25105
dtype: int64
In [459]:
total_app_Fstart = user_df[['first_application_start_ts']].isnull().sum()
total_app_Fstart
Out[459]:
first_application_start_ts    70470
dtype: int64
In [460]:
app_init_rate = total_app_start/total_signup *100
print(f"Application initiation rate is {app_init_rate.iloc[0].round(2)}")	
Application initiation rate is 26.27

Insights: This metric monitors and measures the percentage of potential borrowers who start the loan application process by checking out an item(s). From above, this rate indicates that a little over a quarter of visitors or potential borrowers start the loan application process. It can indicate the effectiveness of your marketing efforts in attracting potential applicants.¶

Actions/Research: We can investigate why the majority of visitors are not initiating applications. This might involve analyzing marketing strategies, understanding the target audience better, or assessing the initial presentation of loan products.¶

Metric 2. Application Completion Rate¶

In [461]:
total_app_compl = user_df[['first_application_complete_ts']].notna().sum()
total_app_compl
Out[461]:
first_application_complete_ts    18948
dtype: int64
In [462]:
total_app_Fcompl = user_df[['first_application_complete_ts']].isnull().sum()
total_app_Fcompl
Out[462]:
first_application_complete_ts    76627
dtype: int64
In [463]:
app_compl_rate = total_app_compl/total_signup *100
print(f"Application Completetion rate is {app_compl_rate.iloc[0].round(2)}")	
Application Completetion rate is 19.83

Insights: This metric tracks the percentage of applicants who sucessfully complete the loan application by providing identifying information. It helps assess the user experience and the ease of completing the application. From above less than 20% of those who start the application complete it, suggesting issues with the application process.¶

Actions/Research: We can examine the user experience of the application process. This could involve user testing to identify points of friction(any moment that creates difficulty, hesitation, or frustration for a user ), simplifying the application, or providing better guidance and information during the process.¶

Metric 3. Loan Approval Rate¶

In [464]:
total_approve = user_df[['first_awaiting_payment_ts']].notna().sum()
total_approve
Out[464]:
first_awaiting_payment_ts    14388
dtype: int64
In [465]:
total_Fapprove = user_df[['first_awaiting_payment_ts']].isnull().sum()
total_Fapprove
Out[465]:
first_awaiting_payment_ts    81187
dtype: int64
In [466]:
approval_rate = total_approve/total_signup *100
print(f"Loan approval rate is {approval_rate.iloc[0].round(2)}")	
Loan approval rate is 15.05

Insights: This metric calculates the percentage of completed loan applications that are approved. It reflects the Perpay(lender)'s underwriting criteria and risk assessment process. A higher approval rate may indicate a more inclusive lending policy. From above, about 15% of completed applications are approved, which could indicate stringent underwriting criteria or a mismatch between the applicants and the loan products.¶

Actions/Research: We can review underwriting criteria to ensure they align with the target market. Also,we can analyze the profiles of applicants to understand if the product is reaching the intended audience.¶

Metric 4. Payroll Direct Deposit Setup Rate¶

In [467]:
total_setup = user_df[['first_paystub_dt']].notna().sum()
total_setup
Out[467]:
first_paystub_dt    8839
dtype: int64
In [468]:
total_Fsetup = user_df[['first_paystub_dt']].isnull().sum()
total_Fsetup
Out[468]:
first_paystub_dt    86736
dtype: int64
In [469]:
setup_rate = total_setup/total_signup *100
print(f"Payroll direct deposite setup approval rate is {setup_rate.iloc[0].round(2)}")	
Payroll direct deposite setup approval rate is 9.25

Insights: This metrics measures the percentage of borrowers who sucessfully set up a payroll direct deposite based on the agreed terms of the approved loan application. It reflects the ability of borrowers to meet the require conditios. From here, we see a significant drop from approval to direct deposit setup suggests challenges in borrowers meeting post-approval requirements.¶

Actions/Research: We can investigate the reasons behind this drop. This might involve assessing the complexity of the direct deposit setup process, the clarity of communication regarding this requirement, or the financial stability of the approved borrowers.¶

Metric 5. Repayment Conversion Rate¶

In [470]:
total_repay = user_df[['first_repayment_ts']].notna().sum()
total_repay
Out[470]:
first_repayment_ts    1524
dtype: int64
In [471]:
total_Frepay = user_df[['first_repayment_ts']].isnull().sum()
total_Frepay
Out[471]:
first_repayment_ts    94051
dtype: int64
In [472]:
repay_rate = total_repay/total_signup *100
repay_rate.round(2)
print(f"repayment conversion rate is {repay_rate.iloc[0].round(2)}")	
repayment conversion rate is 1.59

Insights: This metric tracks the percentage of borrowers who enter the loan repayment phase after the first direct deposit payment has been made and the item(s) will be shipped. It show how successful the lending process is in converting borrowers into active, paying customers. A very low conversion rate into active repayment indicates major issues in the latter stages of the funnel.¶

Actions/Research: This requires urgent attention as this directly impacts revenue and cash flow. Investigate the factors leading to such low repayment. This might include analyzing the terms of the loans, the financial health of borrowers, or the effectiveness of the collection processes.¶

Metrics 6: Average time of filling application within 1 day¶

In [473]:
app_time_df = user_df

#Extract the day from the timestamps
app_time_df['day_difference'] = (user_df['first_application_complete_ts'] - user_df['first_application_start_ts']).dt.days

#Find the time takes to fill loan application
app_time_df['time_difference'] = (user_df['first_application_complete_ts'] - user_df['first_application_start_ts'])

# Average time of filling application within 1 day¶
avg_app_ts = app_time_df[app_time_df['day_difference']<1]['time_difference'].mean()
avg_app_ts

print(f"Average time of filling application within 1 day¶ is {avg_app_ts}")
Average time of filling application within 1 day¶ is 0 days 00:18:15.224973373

Assumptions: Thie metrics's time frame assumes a certain level of digital proficiency among users. if the target audience has less digital experence, this duration might be longer.¶

Insights: This duration suggests that the application is moderately complex. This time frame is reasonable for a detailed financial application but this could also indicate that there is room for improvement in building trust.¶

Actions/Research: we can conduct user experience research to understand if there are any specific parts of the application that take longer to complete. This could involve user testing or analayzing application progression data. Or, we can gather direct feedback from applicants on their experience. This could be done through surveys or interviews post-application to understand user perceptions and identify specific pain points.¶

Metric 7: referral rate, the percentage of referral puchases that make up total number of repayments.¶

In [474]:
refer_df = user_df[['was_referred_ind','first_repayment_ts']].dropna()
refer_df_2 = refer_df.groupby('was_referred_ind')['first_repayment_ts'].count().reset_index(name = 'number_of_refer')
refer_df_2
Out[474]:
was_referred_ind number_of_refer
0 0 1426
1 1 98
In [475]:
referral_rate =(refer_df_2['number_of_refer'].iloc[1]
 /(refer_df_2['number_of_refer'].iloc[1] + refer_df_2['number_of_refer'].iloc[0]) 
 *100).round(2)
referral_rate 
Out[475]:
6.43
In [476]:
refer_df = user_df.groupby('was_referred_ind')['user_id'].count().reset_index(name = 'number_of_refer')
refer_df
Out[476]:
was_referred_ind number_of_refer
0 0 92213
1 1 3362
In [477]:
(refer_df['number_of_refer'].iloc[1]
 /(refer_df['number_of_refer'].iloc[1] + refer_df['number_of_refer'].iloc[0]) 
 *100).round(2)
Out[477]:
3.52

Insights: This metrics measure the impact of the refer program on referee entering repayment. I did this by dividing the number of referees that result in a repayment by the total number of non-null repayment. There are 6.43% of referee that make up the total number of repayments. For your information, the software and digital goods generally have highest referral rate of all industries, at 4.75%. I consider that perpay is having a healthy refer program with 6.43% referral rate, and this means the refer program is having a significant impact on converting people to shop and buy goods using the repayments program in Perpay.¶

Actions/Research: Based on this data, I can see that there is room for improving the referral rate. We can do this by leveraging referal marketing incentives, like providing the referee cash, even 5 or 10 will nudge people to take actions, the points which can be redeedmed for discounts, and discounts of first purchases. Then, we can track our results, like impression rate: number of views for your referral landing page, response rate: who take action after viewing your referral offer, conversion Rate of Referral Purchases: This is a crucial metric. It indicates the percentage of referrals that actually result in a purchase.¶

Metric 7: Cancellation Type Count, interesting finding¶

In [478]:
cancel_df = loan_df
cancel_df.groupby('cancellation_type')['cancellation_type'].count().sort_values(ascending = False)
Out[478]:
cancellation_type
automated.awaiting_payment           9715
automated.application_started        7996
automated.pending                    4455
remorse                              3499
nan                                  2757
qualify_test                         2345
accident                             2065
other_interests                      1649
deposit_failure                      1229
didnt_realize_direct_deposit          688
automated.verification                563
purchased_elsewhere                   507
spending_limit_exceeded               505
magento.edit_order.cancel             437
duplicate_order                       418
1_covid_elevated_risk                 143
automated.deactivate_user_request      40
deny                                    1
Name: cancellation_type, dtype: int64

Insights: the analysis of cancellation type metrics based on the cancellation_type provides valuable insights into why customers are abandoning their loan applications. Above are the results, sorted by the most frequent reasons for cancellations.¶

Action/Research: We can address the most common reasons for cancellations, like 'automated.awaiting_payment'. Cancellations at this stage could imply issues with the payment setup or customer concerns about proceeding with the loan. Streamlining the payment process(providing clear information about payment terms might help). If we address the common cancel reasons, we can streamline the loan application process, enhance customer satisfaction, and potentially increase conversion rates.¶

❖ Do these datasets seem to agree with one another? Is there reason to be suspicious of the quality of the data?¶

Do these datasets seem to agree with one another?¶

1)Datasets agreement: Different dataset sizes¶

In [479]:
user_df
Out[479]:
user_id signup_dt company_name spending_limit_est valid_phone_ind last_login was_referred_ind first_paystub_dt first_application_start_ts first_application_complete_ts first_awaiting_payment_ts first_repayment_ts day_difference time_difference
0 5f477f7c-3367-4bee-ba2c-f3b39a24c7cc 2020-10-13 00:06:21.603537 Other > Add a new company 1000.0 1 2022-11-22 01:21:08.832662 0 NaT NaT NaT NaT NaT NaN NaT
1 2752a3f2-0a96-42f9-bcc3-5fde7edec06b 2020-10-13 00:12:20.590843 Other > Add a new company 700.0 1 2020-12-31 01:50:17.564992 0 NaT 2020-12-31 01:50:18.097958 NaT NaT NaT NaN NaT
2 7d8072a0-0c99-4eb7-8c6f-2c7d54ba52c7 2020-10-13 00:15:26.586054 NaN NaN 0 2020-10-13 00:15:27.200942 0 NaT NaT NaT NaT NaT NaN NaT
3 0067c279-3b94-4867-ae51-aa182cfc8dc3 2020-10-13 00:17:16.041499 Other > Add a new company NaN 1 2020-10-27 15:27:35.952604 0 NaT NaT NaT NaT NaT NaN NaT
4 972e17fd-95b5-4bcc-8f17-c4323abd6c7b 2020-10-13 00:21:46.310764 NaN NaN 0 2020-10-13 00:21:46.398749 0 NaT NaT NaT NaT NaT NaN NaT
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
95570 9d562896-402f-43fb-b2e8-95674b59c00c 2020-10-19 00:03:58.651384 Vishay 1000.0 1 2023-01-16 18:52:38.283396 0 NaT 2022-11-20 16:59:13.045054 2022-11-20 17:02:13.393381 2022-11-20 17:02:15.975683 2022-11-23 13:49:14.122587 0.0 0 days 00:03:00.348327
95571 2db26318-1e86-4611-8343-f22c2f52f700 2020-10-27 17:36:53.948341 Anthem, Inc. 800.0 1 2023-01-16 19:02:09.110595 0 2020-11-02 20:37:56.130803 2020-11-01 23:23:57.524518 2020-11-01 23:25:14.111450 2020-11-02 20:39:23.167071 2021-09-03 14:12:56.800775 0.0 0 days 00:01:16.586932
95572 19444a27-e573-4b06-9ebf-f49a80cb27ba 2020-11-05 13:43:05.890775 Travelers Insurance 1000.0 1 2023-01-15 11:35:48.447004 0 2021-07-13 23:44:08.466518 2022-02-04 16:25:05.347940 2022-03-22 22:05:36.403078 2022-03-22 22:05:39.519980 2022-04-08 13:48:49.297660 46.0 46 days 05:40:31.055138
95573 b2cf7001-2145-4653-a58f-26f658d6fed0 2020-11-01 22:49:39.533347 United States Army (Civilians) 1000.0 1 2022-12-19 13:58:21.389337 0 2020-11-20 23:54:27.624186 2020-11-19 19:16:24.802811 2020-11-20 23:45:58.574113 2020-11-23 14:18:21.049638 NaT 1.0 1 days 04:29:33.771302
95574 a9b1e320-847f-438f-b9c6-d3b374810d6c 2020-10-24 15:36:18.286066 Indian Health Service 1000.0 1 2023-01-15 20:36:10.793236 0 NaT 2023-01-15 20:15:18.820641 2023-01-15 20:17:26.570373 2023-01-15 20:17:33.152737 2023-01-15 20:28:19.813885 0.0 0 days 00:02:07.749732

95575 rows × 14 columns

In [480]:
#unique user_id in user dataset
len(user_df['user_id'].unique())
Out[480]:
95575
In [481]:
loan_df
Out[481]:
loan_id user_id amount number_of_payments user_pinwheel_eligible_at_ap approval_type application_start_ts application_complete_ts awaiting_payment_ts repayment_ts canceled_ts cancellation_type risk_tier_at_uw
0 27f95ba4-e56f-4222-bc92-d73bf8d7d669 beca750a-2041-4c26-8154-6aa85ae9b245 471.48 8.0 0 nan 2020-10-13 00:04:43.644990 2020-10-13 00:08:29.744502 NaT NaT 2020-11-03 06:00:09.020005 automated.verification T0
1 040d9651-3ef2-46ed-9e28-6ad12bf00585 3401d86c-898b-4d6f-ba67-08b2bc005d5f 309.98 8.0 0 nan 2020-10-13 00:24:50.475201 NaT NaT NaT 2020-10-27 09:00:09.906958 automated.application_started T0
2 71a279d5-5bce-4ea2-9e98-ec9d994256b1 ef4ed8ea-17a4-4f41-a1fd-84b6fe09751f 523.98 8.0 1 underwriting.auto.ftb_decision 2020-10-13 00:26:48.749278 2020-10-13 00:27:53.698868 2020-10-13 12:19:55.282708 NaT 2021-02-11 06:30:08.285020 automated.awaiting_payment T0
3 706b5235-8424-4e91-b813-328dc5603e8e 1fa64260-49bf-474e-8800-893c0c455a06 369.96 4.0 0 nan 2020-10-13 01:08:06.763585 NaT NaT NaT 2020-10-27 09:00:08.296637 automated.application_started T0
4 d850645f-a7fe-4cd1-97db-8f1ac1364a67 90f815e9-c931-4d9e-aa79-f136bc5f227c 479.98 8.0 0 nan 2020-10-13 01:10:40.530583 NaT NaT NaT 2020-10-27 09:00:08.090965 automated.application_started T0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
39007 c538afec-eabb-442c-b53f-0d8163ba679c ff61dff9-88e4-4a85-85b9-2a912f156dc9 887.89 8.0 0 underwriting.auto.ftb_decision 2020-10-24 04:55:58.074156 2020-10-24 04:56:44.713389 2020-10-24 04:56:45.707634 2020-10-29 13:42:10.992583 NaT nan T0
39008 8b6b254d-4a62-4579-b640-6a17d61cc76e 880fb13c-70dc-4604-9199-6e158f7c4eda 847.63 16.0 1 underwriting.auto.ftb_decision 2020-10-27 16:55:35.479453 2020-10-27 16:55:59.061944 2020-10-27 16:55:59.529787 2020-11-06 17:03:15.729864 NaT nan T0
39009 5e1b86e3-1f17-4413-9551-5207de5b29c5 fc3cb0a3-a474-4d0f-9be9-e02ba5620d76 304.98 8.0 0 underwriting.auto.ftb_decision 2020-10-30 06:14:34.776961 2020-10-30 06:15:24.078435 2020-10-30 06:15:25.648940 2020-10-30 14:36:51.175908 NaT nan T0
39010 601ee268-b2da-41ea-9470-0c2574e7410c 54612ae8-64d4-426e-8c22-57d2791a0b86 478.96 16.0 1 underwriting.auto.ftb_decision 2020-11-04 01:13:43.302878 2020-11-04 01:14:35.764894 2020-11-04 01:14:37.253225 2020-11-13 15:05:15.799022 NaT nan T0
39011 da8f8586-6ac5-40c5-bdda-8e570151f35f 2ad3615a-81cf-4170-a036-c5a4c1eaeb11 647.94 8.0 1 underwriting.auto.ftb_decision 2020-11-06 02:20:08.042178 2020-11-06 02:21:09.263684 2020-11-06 02:21:11.054734 2020-11-20 15:25:37.351017 NaT nan T0

39012 rows × 13 columns

In [482]:
#unique loan_id in loan dataset
len(loan_df['user_id'].unique())
Out[482]:
31656
In [ ]:
 

Different Dataset Sizes: these datasets seem to agree with one another that not all users who sign up will apply for or take out a loan., because the user_df dataset has a larger number of unique user_id values compared to the loan_df dataset. This is because the user_df includes all users who have signed up for an account or service, while loan_df includes only those who have actually applied for or taken out a loan.¶

2)Datasets disagreement, and suscpcion of the quality of the data: Mismatching in user_id¶

In [483]:
user_appstart = user_df[['user_id','first_application_start_ts']]
user_appstart = user_appstart.dropna()
user_appstart
Out[483]:
user_id first_application_start_ts
1 2752a3f2-0a96-42f9-bcc3-5fde7edec06b 2020-12-31 01:50:18.097958
5 f25d9c31-0a14-434a-adc7-f812685ba9c2 2020-10-13 00:30:38.930504
8 c1651717-68e9-40da-88b6-34ac52261114 2020-10-22 16:21:01.047124
15 f7424169-db78-4451-89a4-b74ebbbc7b03 2020-10-13 02:17:57.782537
16 cb022b83-df8c-4173-bfd0-f6a44922b370 2020-10-13 01:49:45.691136
... ... ...
95570 9d562896-402f-43fb-b2e8-95674b59c00c 2022-11-20 16:59:13.045054
95571 2db26318-1e86-4611-8343-f22c2f52f700 2020-11-01 23:23:57.524518
95572 19444a27-e573-4b06-9ebf-f49a80cb27ba 2022-02-04 16:25:05.347940
95573 b2cf7001-2145-4653-a58f-26f658d6fed0 2020-11-19 19:16:24.802811
95574 a9b1e320-847f-438f-b9c6-d3b374810d6c 2023-01-15 20:15:18.820641

25105 rows × 2 columns

In [484]:
loan_df[['user_id','application_start_ts']]
Out[484]:
user_id application_start_ts
0 beca750a-2041-4c26-8154-6aa85ae9b245 2020-10-13 00:04:43.644990
1 3401d86c-898b-4d6f-ba67-08b2bc005d5f 2020-10-13 00:24:50.475201
2 ef4ed8ea-17a4-4f41-a1fd-84b6fe09751f 2020-10-13 00:26:48.749278
3 1fa64260-49bf-474e-8800-893c0c455a06 2020-10-13 01:08:06.763585
4 90f815e9-c931-4d9e-aa79-f136bc5f227c 2020-10-13 01:10:40.530583
... ... ...
39007 ff61dff9-88e4-4a85-85b9-2a912f156dc9 2020-10-24 04:55:58.074156
39008 880fb13c-70dc-4604-9199-6e158f7c4eda 2020-10-27 16:55:35.479453
39009 fc3cb0a3-a474-4d0f-9be9-e02ba5620d76 2020-10-30 06:14:34.776961
39010 54612ae8-64d4-426e-8c22-57d2791a0b86 2020-11-04 01:13:43.302878
39011 2ad3615a-81cf-4170-a036-c5a4c1eaeb11 2020-11-06 02:20:08.042178

39012 rows × 2 columns

In [485]:
# Perform an inner join in pandas
merged_inner = pd.merge(user_appstart, loan_df, on='user_id', how='inner')
merged_inner
Out[485]:
user_id first_application_start_ts loan_id amount number_of_payments user_pinwheel_eligible_at_ap approval_type application_start_ts application_complete_ts awaiting_payment_ts repayment_ts canceled_ts cancellation_type risk_tier_at_uw
0 f25d9c31-0a14-434a-adc7-f812685ba9c2 2020-10-13 00:30:38.930504 1e1321c0-c3ac-413c-93eb-829753e18357 485.94 8.0 0 nan 2020-10-13 00:30:38.930504 NaT NaT NaT 2020-10-27 09:00:09.552342 automated.application_started T0
1 c1651717-68e9-40da-88b6-34ac52261114 2020-10-22 16:21:01.047124 cb434a06-ffe1-4f13-a776-f320c1f32d65 100.98 NaN 0 nan 2020-10-22 16:21:01.047124 NaT NaT NaT 2020-11-06 10:00:56.076354 automated.application_started T0
2 f7424169-db78-4451-89a4-b74ebbbc7b03 2020-10-13 02:17:57.782537 fdd339c4-3702-41e8-91fe-1aad6e17bd57 528.92 8.0 0 nan 2020-10-13 02:17:57.782537 NaT NaT NaT 2020-10-27 09:00:16.373991 automated.application_started T0
3 cb022b83-df8c-4173-bfd0-f6a44922b370 2020-10-13 01:49:45.691136 c952956e-144c-40fe-b5d6-bc84e4dbe37a 262.98 4.0 0 nan 2020-10-13 01:49:45.691136 2020-10-19 18:20:01.904799 NaT NaT 2020-10-19 18:24:04.161829 remorse T0
4 957bd9b4-a70b-4eba-94b0-b75fd39e0635 2020-10-13 03:22:20.637457 6c4945bf-ed63-41ff-8fb8-641cb53310f3 953.19 16.0 1 underwriting.auto.ftb_decision 2020-10-13 03:22:20.637457 2020-10-13 03:25:10.487610 2020-10-13 11:47:52.548097 NaT 2021-02-11 06:30:09.929995 automated.awaiting_payment T0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
20873 4d94cbb8-5cba-47b2-a8fa-bba98badadca 2020-10-28 17:37:04.274041 ffd85f51-9e03-4202-ab71-ee06a5438c6e 569.98 16.0 1 underwriting.auto.ftb_decision 2020-10-30 22:32:19.815574 2020-10-30 22:32:54.834815 2020-10-30 22:32:55.437703 NaT 2021-02-28 06:31:23.386452 automated.awaiting_payment T0
20874 b411e962-2255-467d-b97f-65ea0d8f771d 2020-10-24 04:44:31.258199 a6ce21f0-bd8c-4175-b514-9f6b3173569a 382.98 8.0 1 underwriting.auto.ftb_decision 2020-10-24 04:49:58.575089 2020-10-24 04:51:16.920086 2020-10-24 04:51:19.087176 2020-11-12 14:33:42.146713 NaT nan T0
20875 b411e962-2255-467d-b97f-65ea0d8f771d 2020-10-24 04:44:31.258199 f9eb9cb6-79e9-4d61-bed9-a6fa1a4813d5 839.98 8.0 1 nan 2020-10-24 04:44:31.258199 NaT NaT NaT 2020-10-24 04:46:54.132675 remorse T0
20876 e691a8bc-e598-4077-ada7-8da2c7bd6dca 2020-10-23 23:30:21.399193 e2b80d0e-3536-46b5-92c1-927b59f4ccbd 678.52 8.0 1 underwriting.auto.ftb_decision 2020-10-23 23:30:21.399193 2020-10-23 23:31:05.948503 2020-10-23 23:31:07.998526 NaT 2021-02-21 06:31:19.630065 automated.awaiting_payment T0
20877 2db26318-1e86-4611-8343-f22c2f52f700 2020-11-01 23:23:57.524518 3f4fc6a0-566c-4fca-8d7b-614b663f75f8 509.98 8.0 0 underwriting.auto.ftb_decision 2020-11-01 23:23:57.524518 2020-11-01 23:25:14.111450 2020-11-02 20:39:23.167071 NaT 2020-12-04 20:32:11.484867 other_interests T0

20878 rows × 14 columns

In [486]:
# Check if the number of unique users in the merged dataframe is equal to that in the loan_df
len(merged_inner['user_id'].unique()) == len(loan_df['user_id'].unique())
Out[486]:
False
In [487]:
len(merged_inner['user_id'].unique())
Out[487]:
17042
In [488]:
len(loan_df['user_id'].unique())
Out[488]:
31656

Mismatch in User IDs: There are users in the loan dataset that don't have a corresponding entry in the user dataset.¶

Specifically, there are 31,656 - 17,042 = 14,614 user_id values in loan_df that aren't matched with user_df. This is a significant number and suggests a potential issue with data integrity, which means that some users were not included in the user dataset.¶

3) Datasets disagreement, suscpcion of the quality of the data: data error¶

In [489]:
# describe statistical information
user_df.describe()
#no user_id, no company name
Out[489]:
signup_dt spending_limit_est valid_phone_ind last_login was_referred_ind first_paystub_dt first_application_start_ts first_application_complete_ts first_awaiting_payment_ts first_repayment_ts day_difference time_difference
count 95575 69055.000000 95575.000000 95575 95575.000000 8839 25105 18948 14388 1524 18948.000000 18948
mean 2020-10-26 07:30:46.781417984 837.165216 0.813947 2021-03-01 14:29:40.952891136 0.035177 2021-02-20 00:47:41.967247872 2021-01-01 01:58:32.123276800 2021-01-09 17:34:31.836468224 2021-01-24 12:24:51.778903552 2021-04-23 19:11:41.864473856 11.244459 11 days 07:05:48.874714700
min 2020-10-13 00:00:13.593771 50.000000 0.000000 2020-10-13 00:01:32.317156 0.000000 2016-11-12 00:40:01.316014 2016-11-12 00:22:57.724906 2016-11-12 00:40:24.715791 2016-11-12 18:17:14.283158 2016-11-30 21:17:45.514341 0.000000 0 days 00:00:12.098707
25% 2020-10-20 06:54:35.580967424 650.000000 1.000000 2020-10-24 22:18:45.041860864 0.000000 2020-10-26 18:07:49.435676928 2020-10-24 03:36:48.439672064 2020-10-24 23:20:00.373304832 2020-10-26 00:51:18.425989888 2020-11-13 15:04:34.914781440 0.000000 0 days 00:00:53.940616750
50% 2020-10-27 01:36:18.374267904 1000.000000 1.000000 2020-11-02 08:15:36.039156992 0.000000 2020-11-05 02:37:25.778889984 2020-11-01 01:38:02.628940032 2020-11-02 02:40:46.823124992 2020-11-03 20:59:17.371486464 2020-12-04 11:47:32.950610688 0.000000 0 days 00:01:15.089982
75% 2020-11-01 05:30:42.165230592 1000.000000 1.000000 2021-01-10 07:02:38.536221952 0.000000 2021-01-17 04:38:59.484386560 2020-11-19 00:52:01.452822016 2020-11-30 19:18:59.148196608 2020-12-13 18:41:46.900191232 2021-09-17 14:33:02.806388992 0.000000 0 days 00:02:19.709684
max 2020-11-06 23:59:49.199581 1000.000000 1.000000 2023-01-16 19:26:10.352780 1.000000 2023-01-12 07:15:04.604709 2023-01-16 13:38:13.593884 2023-01-16 13:39:30.223596 2023-01-16 13:39:32.495406 2023-01-15 20:28:19.813885 1246.000000 1246 days 03:43:44.088223
std NaN 238.771310 0.389151 NaN 0.184227 NaN NaN NaN NaN NaN 74.546476 74 days 14:45:03.149283250
In [490]:
# Create a boolean mask for the constraint
constraint_mask = (
    (user_df['signup_dt'] < user_df['first_application_start_ts']) &
    (user_df['first_application_start_ts'] < user_df['first_application_complete_ts']) &
    (user_df['first_application_complete_ts'] < user_df['first_awaiting_payment_ts']) &
    (user_df['first_awaiting_payment_ts'] < user_df['first_repayment_ts']) &
    (user_df['spending_limit_est'] > 0)  # Assuming that spending limit should be greater than 0
)

# Apply the mask to filter rows that satisfy the constraint
filtered_user_df = user_df[constraint_mask]
filtered_user_df.shape
Out[490]:
(1496, 14)
In [491]:
user_df[user_df['first_paystub_dt'] == '2016-11-12 00:40:01.316014']
Out[491]:
user_id signup_dt company_name spending_limit_est valid_phone_ind last_login was_referred_ind first_paystub_dt first_application_start_ts first_application_complete_ts first_awaiting_payment_ts first_repayment_ts day_difference time_difference
236 bc5b5d91-1118-47fc-895e-91cc709e7e5a 2020-10-14 00:16:44.491340 United States Postal Service 1000.0 1 2022-10-26 00:43:13.947191 1 2016-11-12 00:40:01.316014 2016-11-12 00:22:57.724906 2016-11-12 00:40:24.715791 2016-11-12 18:17:14.283158 2016-11-30 21:17:45.514341 0.0 0 days 00:17:26.990885

From above, I see that users sign up time are from 2020-10-26 to 2020-11-06, it is not possible to see people completing the funnel before the sign up¶

e.g. user_id: "bc5b5d91-1118-47fc-895e-91cc709e7e5a", time stamp of the first time the user started a loan's application process, or first time complete the application process ,or first paystub upload time, or first time awaiting payment, or first repayment time are less than first sign up time.¶

the correct order of the loan application funnel is¶

❖ Start a loan application by checking out on an item(s) 
❖ Complete an application by providing some identifying information 
❖ Approval/Denial of the loan application 
❖ Set up a payroll direct deposit based on the agreed terms of the approved loan application 
❖ Loan enters repayment after the first direct deposit payment has been made and the item(s) will be shipped 

signup_dt < first_application_start_ts < first_application_complete_ts < first_paystub_dt < first_awaiting_payment_ts < first_repayment_ts¶

so, any row of observation that are not in this order maybe deemed invalid.¶

How many loans enter repayment within 15 days of approval?¶

from user datasets¶

In [492]:
user_df.columns
Out[492]:
Index(['user_id', 'signup_dt', 'company_name', 'spending_limit_est',
       'valid_phone_ind', 'last_login', 'was_referred_ind', 'first_paystub_dt',
       'first_application_start_ts', 'first_application_complete_ts',
       'first_awaiting_payment_ts', 'first_repayment_ts', 'day_difference',
       'time_difference'],
      dtype='object')
In [493]:
A_repay = user_df[['user_id','first_awaiting_payment_ts','first_repayment_ts']]
A_repay = A_repay[A_repay['first_awaiting_payment_ts'].notnull()]
A_repay
Out[493]:
user_id first_awaiting_payment_ts first_repayment_ts
22 957bd9b4-a70b-4eba-94b0-b75fd39e0635 2020-10-13 11:47:52.548097 NaT
39 020d9f88-5cae-434a-83b6-d268b1c51276 2020-10-15 15:22:54.337590 NaT
46 da60418c-b6a8-4d6b-ba74-e7015788e099 2021-01-27 00:03:31.446748 NaT
53 d7335de9-e9e9-4eba-9909-bf7dbaff54e5 2020-10-15 06:23:24.747991 2020-10-23 13:59:24.148156
59 b90038d6-cf39-4fc5-b5a9-cefe597e3315 2022-02-26 06:02:31.446417 NaT
... ... ... ...
95570 9d562896-402f-43fb-b2e8-95674b59c00c 2022-11-20 17:02:15.975683 2022-11-23 13:49:14.122587
95571 2db26318-1e86-4611-8343-f22c2f52f700 2020-11-02 20:39:23.167071 2021-09-03 14:12:56.800775
95572 19444a27-e573-4b06-9ebf-f49a80cb27ba 2022-03-22 22:05:39.519980 2022-04-08 13:48:49.297660
95573 b2cf7001-2145-4653-a58f-26f658d6fed0 2020-11-23 14:18:21.049638 NaT
95574 a9b1e320-847f-438f-b9c6-d3b374810d6c 2023-01-15 20:17:33.152737 2023-01-15 20:28:19.813885

14388 rows × 3 columns

In [494]:
A_num_loan = A_repay[['first_repayment_ts']].notnull().sum().iloc[0]
print(f"There are {A_num_loan} loans enter repayment of approvals.")	
There are 1524 loans enter repayment of approvals.
In [495]:
A_repayN = A_repay.dropna()
# Create an independent copy of the DataFrame slice
#By using A_repayN.copy(), you create a new DataFrame that is independent of the original one.
#Any modifications you make to this copy won't trigger the SettingWithCopyWarning, 
#as it's clear to Pandas that A_repayN is a standalone DataFrame.
A_repayN = A_repayN.copy()

# Now perform your operation
A_repayN['timediff'] = A_repayN['first_repayment_ts'] - A_repayN['first_awaiting_payment_ts']
A_repayN['daysdiff'] =A_repayN['timediff'].dt.days
A_repay_result = A_repayN[A_repayN['daysdiff']<=15].reset_index()
A_repay_result
Out[495]:
index user_id first_awaiting_payment_ts first_repayment_ts timediff daysdiff
0 53 d7335de9-e9e9-4eba-9909-bf7dbaff54e5 2020-10-15 06:23:24.747991 2020-10-23 13:59:24.148156 8 days 07:35:59.400165 8
1 334 50fdac43-85ba-4bae-9ed2-1de7180e367e 2020-11-16 11:25:29.719128 2020-11-25 14:52:38.530067 9 days 03:27:08.810939 9
2 370 298c7fd1-aa16-49b8-a9bc-f9d270a5f6b1 2020-10-14 15:23:23.809563 2020-10-22 13:35:26.253276 7 days 22:12:02.443713 7
3 753 f9132e9e-15c1-48b4-aa12-c30a1104bb82 2020-12-19 21:24:39.993132 2020-12-24 15:10:36.806978 4 days 17:45:56.813846 4
4 840 13e58d18-208f-433b-b199-27dfdac1acb1 2020-10-26 10:34:57.350641 2020-11-02 14:11:54.774641 7 days 03:36:57.424000 7
... ... ... ... ... ... ...
809 95558 58c5d22c-f5ff-4af9-99b7-10b32e6f2c34 2021-09-30 07:35:04.399192 2021-10-02 17:05:03.733286 2 days 09:29:59.334094 2
810 95561 2b5ceb0a-8a9f-46c5-9745-87ab0ec794e9 2022-11-27 01:33:17.896867 2022-12-07 13:06:32.459778 10 days 11:33:14.562911 10
811 95568 8a686a7a-cb47-489f-99e6-50bfd4755601 2022-09-28 14:01:48.565190 2022-10-14 13:57:22.435704 15 days 23:55:33.870514 15
812 95570 9d562896-402f-43fb-b2e8-95674b59c00c 2022-11-20 17:02:15.975683 2022-11-23 13:49:14.122587 2 days 20:46:58.146904 2
813 95574 a9b1e320-847f-438f-b9c6-d3b374810d6c 2023-01-15 20:17:33.152737 2023-01-15 20:28:19.813885 0 days 00:10:46.661148 0

814 rows × 6 columns

In [496]:
print(f"There are {len(A_repay_result)} loans enter repayment within 15 days of approvals.")	
There are 814 loans enter repayment within 15 days of approvals.

from loan datasets¶

In [497]:
loan_df.columns
Out[497]:
Index(['loan_id', 'user_id', 'amount', 'number_of_payments',
       'user_pinwheel_eligible_at_ap', 'approval_type', 'application_start_ts',
       'application_complete_ts', 'awaiting_payment_ts', 'repayment_ts',
       'canceled_ts', 'cancellation_type', 'risk_tier_at_uw'],
      dtype='object')
In [498]:
loan_df['approval_type'].unique()
Out[498]:
array(['nan', 'underwriting.auto.ftb_decision',
       'underwriting.manual.approve', 'underwriting.manual_review',
       'underwriting.manual.verification', 'underwriting.auto.deny',
       'underwriting.manual.deny', 'underwriting.manual.override_approve',
       'underwriting.auto.approve'], dtype=object)
In [499]:
loan_df['awaiting_payment_ts'].notnull().sum()
Out[499]:
18275
In [500]:
loan_df['approval_type'].notnull().sum()
Out[500]:
39012
In [501]:
B_repay = loan_df[['loan_id','awaiting_payment_ts','repayment_ts']]
B_repay = B_repay[B_repay['awaiting_payment_ts'].notnull()]
B_repay
Out[501]:
loan_id awaiting_payment_ts repayment_ts
2 71a279d5-5bce-4ea2-9e98-ec9d994256b1 2020-10-13 12:19:55.282708 NaT
8 e8998cbb-37ab-4180-93e9-d22b7855665f 2020-10-13 02:41:12.924726 NaT
9 0004ff34-d655-4f25-9104-6e108f454a13 2020-10-13 02:59:35.127466 NaT
11 6c4945bf-ed63-41ff-8fb8-641cb53310f3 2020-10-13 11:47:52.548097 NaT
20 471cb918-697c-4cb9-8846-fb075000b6d0 2020-10-13 06:01:33.888217 NaT
... ... ... ...
39007 c538afec-eabb-442c-b53f-0d8163ba679c 2020-10-24 04:56:45.707634 2020-10-29 13:42:10.992583
39008 8b6b254d-4a62-4579-b640-6a17d61cc76e 2020-10-27 16:55:59.529787 2020-11-06 17:03:15.729864
39009 5e1b86e3-1f17-4413-9551-5207de5b29c5 2020-10-30 06:15:25.648940 2020-10-30 14:36:51.175908
39010 601ee268-b2da-41ea-9470-0c2574e7410c 2020-11-04 01:14:37.253225 2020-11-13 15:05:15.799022
39011 da8f8586-6ac5-40c5-bdda-8e570151f35f 2020-11-06 02:21:11.054734 2020-11-20 15:25:37.351017

18275 rows × 3 columns

In [502]:
B_num_loan = B_repay[['repayment_ts']].notnull().sum().iloc[0]
print(f"There are {B_num_loan} loans enter repayment of approvals.")	
There are 1499 loans enter repayment of approvals.
In [503]:
B_repayN = B_repay.dropna()
# Create an independent copy of the DataFrame slice
#By using A_repayN.copy(), you create a new DataFrame that is independent of the original one.
#Any modifications you make to this copy won't trigger the SettingWithCopyWarning, 
#as it's clear to Pandas that A_repayN is a standalone DataFrame.
B_repayN = B_repayN.copy()

# Now perform your operation
B_repayN['timediff'] = B_repayN['repayment_ts'] - B_repayN['awaiting_payment_ts']
B_repayN['daysdiff'] = B_repayN['timediff'].dt.days
B_repay_result = B_repayN[B_repayN['daysdiff']<=15].reset_index()
B_repay_result
Out[503]:
index loan_id awaiting_payment_ts repayment_ts timediff daysdiff
0 39 872a3fce-4e3a-4120-9803-d2ebbe3a24de 2020-10-13 17:03:37.755547 2020-10-16 14:30:20.587989 2 days 21:26:42.832442 2
1 41 1fe4e816-a795-4117-b7b6-dea4b0770efb 2020-10-13 17:07:39.660251 2020-10-23 13:57:59.591775 9 days 20:50:19.931524 9
2 58 1ce92996-7c10-4d57-a7bf-3cc7fff3d0fa 2020-10-13 19:09:41.350623 2020-10-23 13:59:15.645653 9 days 18:49:34.295030 9
3 182 b20fcf6c-7e69-4e3e-9f1a-5abc8bae5bbd 2020-10-16 03:25:53.399806 2020-10-27 13:25:50.585331 11 days 09:59:57.185525 11
4 237 b17c56d0-3be3-4f07-8eaf-a6c74f47a9c8 2020-10-15 18:47:17.028023 2020-10-23 13:59:28.740101 7 days 19:12:11.712078 7
... ... ... ... ... ... ...
1045 39007 c538afec-eabb-442c-b53f-0d8163ba679c 2020-10-24 04:56:45.707634 2020-10-29 13:42:10.992583 5 days 08:45:25.284949 5
1046 39008 8b6b254d-4a62-4579-b640-6a17d61cc76e 2020-10-27 16:55:59.529787 2020-11-06 17:03:15.729864 10 days 00:07:16.200077 10
1047 39009 5e1b86e3-1f17-4413-9551-5207de5b29c5 2020-10-30 06:15:25.648940 2020-10-30 14:36:51.175908 0 days 08:21:25.526968 0
1048 39010 601ee268-b2da-41ea-9470-0c2574e7410c 2020-11-04 01:14:37.253225 2020-11-13 15:05:15.799022 9 days 13:50:38.545797 9
1049 39011 da8f8586-6ac5-40c5-bdda-8e570151f35f 2020-11-06 02:21:11.054734 2020-11-20 15:25:37.351017 14 days 13:04:26.296283 14

1050 rows × 6 columns

In [504]:
print(f"There are {len(B_repay_result)} loans enter repayment within 15 days of approvals.")	
There are 1050 loans enter repayment within 15 days of approvals.
In [505]:
print(f"There are {A_num_loan} loans enter repayment of approvals.")	
print(f"There are {len(A_repay_result)} loans enter repayment within 15 days of approvals.")	
print(f"There are {B_num_loan} loans enter repayment of approvals.")	
print(f"There are {len(B_repay_result)} loans enter repayment within 15 days of approvals.")	
There are 1524 loans enter repayment of approvals.
There are 814 loans enter repayment within 15 days of approvals.
There are 1499 loans enter repayment of approvals.
There are 1050 loans enter repayment within 15 days of approvals.

Other findings¶

Cancellation Type Count¶

In [506]:
cancel_df = loan_df
cancel_df.groupby('cancellation_type')['cancellation_type'].count().sort_values(ascending = False).reset_index(name= 'count')
Out[506]:
cancellation_type count
0 automated.awaiting_payment 9715
1 automated.application_started 7996
2 automated.pending 4455
3 remorse 3499
4 nan 2757
5 qualify_test 2345
6 accident 2065
7 other_interests 1649
8 deposit_failure 1229
9 didnt_realize_direct_deposit 688
10 automated.verification 563
11 purchased_elsewhere 507
12 spending_limit_exceeded 505
13 magento.edit_order.cancel 437
14 duplicate_order 418
15 1_covid_elevated_risk 143
16 automated.deactivate_user_request 40
17 deny 1

Insights: the analysis of cancellation type metrics based on the cancellation_type provides valuable insights into why customers are abandoning their loan applications. Above are the results, sorted by the most frequent reasons for cancellations.¶

Action/Research: We can address the most common reasons for cancellations, like 'automated.awaiting_payment'. Cancellations at this stage could imply issues with the payment setup or customer concerns about proceeding with the loan. Streamlining the payment process(providing clear information about payment terms might help). If we address the common cancel reasons, we can streamline the loan application process, enhance customer satisfaction, and potentially increase conversion rates.¶

Approval types. count¶

In [507]:
loan_df
Out[507]:
loan_id user_id amount number_of_payments user_pinwheel_eligible_at_ap approval_type application_start_ts application_complete_ts awaiting_payment_ts repayment_ts canceled_ts cancellation_type risk_tier_at_uw
0 27f95ba4-e56f-4222-bc92-d73bf8d7d669 beca750a-2041-4c26-8154-6aa85ae9b245 471.48 8.0 0 nan 2020-10-13 00:04:43.644990 2020-10-13 00:08:29.744502 NaT NaT 2020-11-03 06:00:09.020005 automated.verification T0
1 040d9651-3ef2-46ed-9e28-6ad12bf00585 3401d86c-898b-4d6f-ba67-08b2bc005d5f 309.98 8.0 0 nan 2020-10-13 00:24:50.475201 NaT NaT NaT 2020-10-27 09:00:09.906958 automated.application_started T0
2 71a279d5-5bce-4ea2-9e98-ec9d994256b1 ef4ed8ea-17a4-4f41-a1fd-84b6fe09751f 523.98 8.0 1 underwriting.auto.ftb_decision 2020-10-13 00:26:48.749278 2020-10-13 00:27:53.698868 2020-10-13 12:19:55.282708 NaT 2021-02-11 06:30:08.285020 automated.awaiting_payment T0
3 706b5235-8424-4e91-b813-328dc5603e8e 1fa64260-49bf-474e-8800-893c0c455a06 369.96 4.0 0 nan 2020-10-13 01:08:06.763585 NaT NaT NaT 2020-10-27 09:00:08.296637 automated.application_started T0
4 d850645f-a7fe-4cd1-97db-8f1ac1364a67 90f815e9-c931-4d9e-aa79-f136bc5f227c 479.98 8.0 0 nan 2020-10-13 01:10:40.530583 NaT NaT NaT 2020-10-27 09:00:08.090965 automated.application_started T0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
39007 c538afec-eabb-442c-b53f-0d8163ba679c ff61dff9-88e4-4a85-85b9-2a912f156dc9 887.89 8.0 0 underwriting.auto.ftb_decision 2020-10-24 04:55:58.074156 2020-10-24 04:56:44.713389 2020-10-24 04:56:45.707634 2020-10-29 13:42:10.992583 NaT nan T0
39008 8b6b254d-4a62-4579-b640-6a17d61cc76e 880fb13c-70dc-4604-9199-6e158f7c4eda 847.63 16.0 1 underwriting.auto.ftb_decision 2020-10-27 16:55:35.479453 2020-10-27 16:55:59.061944 2020-10-27 16:55:59.529787 2020-11-06 17:03:15.729864 NaT nan T0
39009 5e1b86e3-1f17-4413-9551-5207de5b29c5 fc3cb0a3-a474-4d0f-9be9-e02ba5620d76 304.98 8.0 0 underwriting.auto.ftb_decision 2020-10-30 06:14:34.776961 2020-10-30 06:15:24.078435 2020-10-30 06:15:25.648940 2020-10-30 14:36:51.175908 NaT nan T0
39010 601ee268-b2da-41ea-9470-0c2574e7410c 54612ae8-64d4-426e-8c22-57d2791a0b86 478.96 16.0 1 underwriting.auto.ftb_decision 2020-11-04 01:13:43.302878 2020-11-04 01:14:35.764894 2020-11-04 01:14:37.253225 2020-11-13 15:05:15.799022 NaT nan T0
39011 da8f8586-6ac5-40c5-bdda-8e570151f35f 2ad3615a-81cf-4170-a036-c5a4c1eaeb11 647.94 8.0 1 underwriting.auto.ftb_decision 2020-11-06 02:20:08.042178 2020-11-06 02:21:09.263684 2020-11-06 02:21:11.054734 2020-11-20 15:25:37.351017 NaT nan T0

39012 rows × 13 columns

In [508]:
approve_df = loan_df
approve_df.groupby('approval_type')['approval_type'].count().sort_values(ascending = False).reset_index(name= 'count')
Out[508]:
approval_type count
0 underwriting.auto.ftb_decision 17761
1 nan 15707
2 underwriting.manual_review 2740
3 underwriting.manual.verification 1106
4 underwriting.auto.deny 920
5 underwriting.manual.approve 662
6 underwriting.manual.deny 109
7 underwriting.manual.override_approve 4
8 underwriting.auto.approve 3

Sign up date counts, over time, what is the seasoning pattern.¶

In [509]:
other = user_df[['user_id','signup_dt']]
other = other.copy()
In [510]:
# Assuming 'other' is your DataFrame and 'signup_dt' is your datetime column
other['year'] = other['signup_dt'].dt.year
other['month'] = other['signup_dt'].dt.month
other['day'] = other['signup_dt'].dt.day
other['date'] = pd.to_datetime(other[['year', 'month', 'day']])

other_n = other.groupby(['date'])['signup_dt'].count().reset_index(name = 'frequency')
other_n
Out[510]:
date frequency
0 2020-10-13 2674
1 2020-10-14 3273
2 2020-10-15 3353
3 2020-10-16 2999
4 2020-10-17 3538
5 2020-10-18 3401
6 2020-10-19 3497
7 2020-10-20 3270
8 2020-10-21 3410
9 2020-10-22 3644
10 2020-10-23 3366
11 2020-10-24 3447
12 2020-10-25 3663
13 2020-10-26 3853
14 2020-10-27 5800
15 2020-10-28 5370
16 2020-10-29 4386
17 2020-10-30 3951
18 2020-10-31 3747
19 2020-11-01 4070
20 2020-11-02 4037
21 2020-11-03 4434
22 2020-11-04 3995
23 2020-11-05 4162
24 2020-11-06 4235
In [511]:
# Create a line plot
sns.lineplot(data=other_n, x='date', y='frequency')

# Finding the highest two points
top_two = other_n.nlargest(2, 'frequency')

# Annotating the highest two points with date and frequency
for _, row in top_two.iterrows():
    label = f"{row['date'].strftime('%Y-%m-%d')}, {row['frequency']}"
    plt.annotate(label, (row['date'], row['frequency']), textcoords="offset points", xytext=(50,1), ha='center')

# Highlighting the highest two points with dots
plt.scatter(top_two['date'], top_two['frequency'], color='red', s=50)  # 's' is the size of the dot
    
    
# Enhancing the plot
plt.title('Frequency Over Time')
plt.xlabel('Date')
plt.ylabel('Frequency')
plt.xticks(rotation=45)

# Show the plot
plt.show()
/Users/torresliu/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
/Users/torresliu/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
In [512]:
##time of the day??
In [513]:
other
Out[513]:
user_id signup_dt year month day date
0 5f477f7c-3367-4bee-ba2c-f3b39a24c7cc 2020-10-13 00:06:21.603537 2020 10 13 2020-10-13
1 2752a3f2-0a96-42f9-bcc3-5fde7edec06b 2020-10-13 00:12:20.590843 2020 10 13 2020-10-13
2 7d8072a0-0c99-4eb7-8c6f-2c7d54ba52c7 2020-10-13 00:15:26.586054 2020 10 13 2020-10-13
3 0067c279-3b94-4867-ae51-aa182cfc8dc3 2020-10-13 00:17:16.041499 2020 10 13 2020-10-13
4 972e17fd-95b5-4bcc-8f17-c4323abd6c7b 2020-10-13 00:21:46.310764 2020 10 13 2020-10-13
... ... ... ... ... ... ...
95570 9d562896-402f-43fb-b2e8-95674b59c00c 2020-10-19 00:03:58.651384 2020 10 19 2020-10-19
95571 2db26318-1e86-4611-8343-f22c2f52f700 2020-10-27 17:36:53.948341 2020 10 27 2020-10-27
95572 19444a27-e573-4b06-9ebf-f49a80cb27ba 2020-11-05 13:43:05.890775 2020 11 5 2020-11-05
95573 b2cf7001-2145-4653-a58f-26f658d6fed0 2020-11-01 22:49:39.533347 2020 11 1 2020-11-01
95574 a9b1e320-847f-438f-b9c6-d3b374810d6c 2020-10-24 15:36:18.286066 2020 10 24 2020-10-24

95575 rows × 6 columns

In [514]:
# Assuming 'other' is your DataFrame and 'signup_dt' is your datetime column
other['Hour'] = other['signup_dt'].dt.hour
other['minute'] = other['signup_dt'].dt.minute
other['seconds'] = other['signup_dt'].dt.second
#other['time'] = pd.to_datetime(other[['hour', 'minute', 'seconds']])

other_t = other.groupby(['Hour'])['signup_dt'].count().reset_index(name = 'Frequency')
other_t
Out[514]:
Hour Frequency
0 0 5275
1 1 5461
2 2 5679
3 3 5220
4 4 4225
5 5 3217
6 6 1939
7 7 1892
8 8 1988
9 9 1744
10 10 1762
11 11 2074
12 12 2713
13 13 3864
14 14 4402
15 15 4614
16 16 4854
17 17 5211
18 18 5039
19 19 5079
20 20 4864
21 21 4846
22 22 4661
23 23 4952
In [515]:
# Create a line plot
sns.lineplot(data=other_t, x='Hour', y='Frequency')
plt.title('Frequency Over Hour')
/Users/torresliu/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
/Users/torresliu/anaconda3/lib/python3.11/site-packages/seaborn/_oldcore.py:1119: FutureWarning: use_inf_as_na option is deprecated and will be removed in a future version. Convert inf values to NaN before operating instead.
  with pd.option_context('mode.use_inf_as_na', True):
Out[515]:
Text(0.5, 1.0, 'Frequency Over Hour')